Initialization

Libraries

library(BSgenome.Hsapiens.UCSC.hg38)
library(plyranges)
library(tidyverse)
library(magrittr)
library(cowplot)

Parameters

WIN_SIZE = 1000
RADIUS = 300
EPSILON = 30
TPM = 5
N_CTS_LOW = 10
N_CTS_HIGH = 80

COLOR_VALS_CPA=c(TGTA="#73ADD6", AWTAAA="#358C44", TKTKTK="#F36B4D")
COLOR_VALS_CPA_PA=c(TGTA="#73ADD6", AWTAAA="#358C44", TKTKTK="#F36B4D", AAAAAA="#888888")
COLOR_VALS_EXTRA=c(TGTA="#73ADD6", AWTAAA="#358C44", TKTKTK="#F36B4D", CAATTA="orchid4")

FILE_UTROME = sprintf("data/granges/utrome_gr_txs.e%d.t%d.gc39.pas3.f0.9999.w500.Rds", EPSILON, TPM)
FILE_BED = sprintf("data/bed/celltypes/celltypes.e%d.t%d.bed.gz", EPSILON, TPM)
FILE_BED_UNLIKELY = sprintf("data/bed/cleavage-sites/utrome.unlikely.e%d.t%d.gc39.pas3.f0.9999.bed.gz",
                            EPSILON, TPM)

Functions

get_centered_motif_sites <- function (motif, seqs) {
  seqs %>% 
    vmatchPattern(pattern=motif, fixed=FALSE) %>%
    unlist %>% as.data.frame %>%
    mutate(motif=motif,
           start=start - WIN_SIZE/2,
           end=end - WIN_SIZE/2,
           center=(end + start)/2) %>%
    select(motif, center, start, end)
}

## plot density from motif positions data.frame
plot_motif_density <- function (df_motifs, RADIUS=300, title="UTRome",
                                col_values=COLOR_VALS_CPA) {
  df_motifs %>%
    ggplot(aes(x=center, color=motif)) +
    stat_density(aes(y=..scaled..), geom='line', position='identity', 
                 size=1.5, alpha=0.9) +
    geom_hline(yintercept=0) +
    geom_vline(xintercept=0, linetype='dashed', color='black') +
    coord_cartesian(xlim=c(-RADIUS, RADIUS)) +
    scale_x_continuous(breaks=seq(-RADIUS, RADIUS, 100), 
                       limits=c(-RADIUS - 100, RADIUS+100)) +
    scale_color_manual(values=col_values) +
    labs(x=sprintf("Distance from Cleavage Site (%s)", title),
         y="Relative Density", color="Motif") +
    guides(color=guide_legend(override.aes=list(alpha=1, size=3))) +
    theme_minimal_vgrid()
}

## plot count from motif positions data.frame
plot_motif_count <- function (df_motifs, RADIUS=300, title="UTRome",
                                col_values=COLOR_VALS_CPA) {
  df_motifs %>%
    ggplot(aes(x=center, color=motif)) +
    stat_density(aes(y=..count..), geom='line', position='identity', 
                 size=1.5, alpha=0.9) +
    geom_hline(yintercept=0) +
    geom_vline(xintercept=0, linetype='dashed', color='black') +
    coord_cartesian(xlim=c(-RADIUS, RADIUS)) +
    scale_x_continuous(breaks=seq(-RADIUS, RADIUS, 100), 
                       limits=c(-RADIUS - 100, RADIUS+100)) +
    scale_color_manual(values=col_values) +
    labs(x=sprintf("Distance from Cleavage Site (%s)", title),
         y="Count", color="Motif") +
    guides(color=guide_legend(override.aes=list(alpha=1, size=3))) +
    theme_minimal_vgrid()
}

## plot unscaled density from motif positions data.frame
plot_motif_density_ns <- function (df_motifs, RADIUS=300, title="UTRome",
                                col_values=COLOR_VALS_CPA) {
  df_motifs %>%
    ggplot(aes(x=center, color=motif)) +
    stat_density(geom='line', size=1.5, alpha=0.9, position="identity") +
    geom_hline(yintercept=0) +
    geom_vline(xintercept=0, linetype='dashed', color='black') +
    coord_cartesian(xlim=c(-RADIUS, RADIUS)) +
    scale_x_continuous(breaks=seq(-RADIUS, RADIUS, 100), 
                       limits=c(-RADIUS - 100, RADIUS+100)) +
    scale_color_manual(values=col_values) +
    labs(x=sprintf("Distance from Cleavage Site (%s)", title),
         y="Density", color="Motif") +
    guides(color=guide_legend(override.aes=list(alpha=1, size=3))) +
    theme_minimal_vgrid()
}

Load Data

Cleavage Sites by Cell Type

gr_sites <- read_bed(FILE_BED) %>% 
    `seqlevelsStyle<-`("UCSC") %>%
    keepStandardChromosomes(pruning.mode="coarse") %>%
    anchor_center() %>%
    mutate(width=EPSILON)

Human UTRome Transcripts

## Load all transcripts
gr_txs <- readRDS(FILE_UTROME)

## focus on cleavage sites
gr_cleavage <- gr_txs %>%
    anchor_3p %>%
    mutate(n_celltypes=count_overlaps_directed(mutate(., width=0), gr_sites)) %>%
    mutate(width=WIN_SIZE) %>%
    shift_downstream(WIN_SIZE/2) %>%
    mutate(origin=ifelse(is_novel, "UTRome", "GENCODE"),
           origin_ud=case_when(
               !is_novel ~ "GENCODE",
               str_detect(transcript_id, "UTR-") ~ "upstream",
               str_detect(transcript_id, "UTR+") ~ "downstream"
           ))

Cleavage Sites Marked as Internal Priming

gr_ip <- read_bed(FILE_BED_UNLIKELY) %>%
  anchor_3p %>%
  mutate(width=WIN_SIZE) %>%
  shift_downstream(WIN_SIZE/2)

Subgroups

gr_single <- filter(gr_cleavage, utr_type == "single")
gr_ipa <- filter(gr_cleavage, is_ipa)
gr_multi_tandem <- filter(gr_cleavage, utr_type == 'multi', !is_ipa)

gr_gencode <- filter(gr_cleavage, !is_novel)
gr_novel <- filter(gr_cleavage, is_novel)

gr_proximal_gc <- filter(gr_cleavage, utr_type == 'multi', is_proximal, !is_novel)
gr_nondistal_gc <- filter(gr_cleavage, utr_type == 'multi', !is_distal, !is_novel)
gr_distal_gc <- filter(gr_cleavage, utr_type == 'multi', is_distal, !is_novel)

gr_proximal_novel <- filter(gr_cleavage, utr_type == 'multi', is_proximal, is_novel)
gr_nondistal_novel <- filter(gr_cleavage, utr_type == 'multi', !is_distal, is_novel)
gr_distal_novel <- filter(gr_cleavage, utr_type == 'multi', is_distal, is_novel)

## only GENCODE
gr_ctnone_gc <- filter(gr_cleavage, !is_novel, n_celltypes == 0)
gr_common <- filter(gr_cleavage, !is_novel, n_celltypes > 0)

gr_ctlow_gc <- filter(gr_cleavage, !is_novel, n_celltypes > 0, n_celltypes < N_CTS_LOW)
gr_ctmid_gc <- filter(gr_cleavage, !is_novel, n_celltypes >= N_CTS_LOW, n_celltypes < N_CTS_HIGH)
gr_cthigh_gc <- filter(gr_cleavage, !is_novel, n_celltypes >= N_CTS_HIGH)

gr_ctlow_novel <- filter(gr_cleavage, is_novel, n_celltypes < N_CTS_LOW)
gr_ctmid_novel <- filter(gr_cleavage, is_novel, n_celltypes >= N_CTS_LOW, n_celltypes < N_CTS_HIGH)
gr_cthigh_novel <- filter(gr_cleavage, is_novel, n_celltypes >= N_CTS_HIGH)

Plots

All Sites (79070 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_cleavage)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="All")

Single-UTR Genes (3747 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_single)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="Single UTR Genes")

Intronic Cleavage Sites (18049 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_ipa)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="Intronic Sites")

Tandem Cleavage Sites (57423 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_multi_tandem)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="Tandem Sites")

GENCODE

All Cleavage Sites (56548 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_gencode)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="GENCODE Sites")

Proximal Cleavage Sites (10573 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_proximal_gc)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="GENCODE Proximal Sites")

Non-Distal Cleavage Sites (39828 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_nondistal_gc)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="GENCODE Non-Distal Sites")

Distal Cleavage Sites (12973 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_distal_gc)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="GENCODE Distal Sites")

Cell Types High (7380 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_cthigh_gc)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="GENCODE - Cell Types High")

Cell Types Midrange (13671 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_ctmid_gc)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="GENCODE - Cell Types Midrange")

Cell Types Low (10539 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_ctlow_gc)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="GENCODE - Cell Types Low")

No Cell Types Supporting (24958 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_ctnone_gc)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="GENCODE - No Supporting Cell Types")

Export

ggsave("img/sq/sup1c-motif-density-gencode-hcl.pdf", 
       width=8, height=6, dpi=300)
## Warning: Removed 68753 rows containing non-finite values (`stat_density()`).
seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_ctnone_gc)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density_ns(title="GENCODE - No Supporting Cell Types")

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_ctnone_gc)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_count(title="GENCODE - No Supporting Cell Types")

Common Cleavage Sites (31590 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_common)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="GENCODE + HCL")

Export

ggsave("img/sq/sup1c-motif-density-common-hcl.pdf", 
       width=8, height=6, dpi=300)
## Warning: Removed 85062 rows containing non-finite values (`stat_density()`).
seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_common)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density_ns(title="GENCODE + HCL")

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_common)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_count(title="GENCODE + HCL")

Novel Cleavage Sites

All Cleavage Sites (22522 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_novel)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="Novel Sites")

Export

ggsave("img/sq/sup1c-motif-density-utrome-hcl.pdf", 
       width=8, height=6, dpi=300)
## Warning: Removed 77133 rows containing non-finite values (`stat_density()`).
seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_novel)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density_ns(title="Novel Sites")

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_novel)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_count(title="Novel Sites")

Proximal Cleavage Sites (4543 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_proximal_novel)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="Novel Proximal Sites")

Non-Distal Cleavage Sites (20379 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_nondistal_novel)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="Novel Non-Distal Sites")

Distal Cleavage Sites (2143 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_distal_novel)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="Novel Distal Sites")

Cell Types High (1397 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_cthigh_novel)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="Novel - Cell Types High")

Cell Types Midrange (6191 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_ctmid_novel)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="Novel - Cell Types Midrange")

Cell Types Low (14934 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_ctlow_novel)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="Novel - Cell Types Low")

Internal Priming Sites (70462 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_ip)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)

rbind(df_tgta, df_pas, df_tktktk) %>%
  plot_motif_density(title="Internal Priming Sites")

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_ip)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)
df_aaaaaa <- get_centered_motif_sites(motif="AAAAAA", seqs)

rbind(df_tgta, df_pas, df_tktktk, df_aaaaaa) %>%
  plot_motif_density(title="Internal Priming Sites", col_values=COLOR_VALS_CPA_PA)

Combined Motif Analysis

UTRome vs GENCODE

grs_cleavage <- gr_cleavage %>% split(.$origin)

seqs_cleavage <- lapply(grs_cleavage, . %>% getSeq(x=BSgenome.Hsapiens.UCSC.hg38))

lapply_get_centered_motif_sites <- function(motif, seqs=seqs_cleavage) {
  lapply(seqs, . %>% get_centered_motif_sites(motif=motif)) %>% 
    do.call(what=rbind) %>%
    mutate(origin=str_extract(rownames(.), "^[^.]+")) %>%
    `rownames<-`(NULL)
}

df_tgta <- lapply_get_centered_motif_sites(motif="TGTA") %>%
  mutate(origin=factor(origin, levels=c("GENCODE", "UTRome")))
df_pas <- lapply_get_centered_motif_sites(motif="AWTAAA") %>%
  mutate(origin=factor(origin, levels=c("GENCODE", "UTRome")))
df_tktktk <- lapply_get_centered_motif_sites(motif="TKTKTK") %>%
  mutate(origin=factor(origin, levels=c("GENCODE", "UTRome")))

df_combined <- rbind(df_tgta, df_pas, df_tktktk)

Plot

df_combined %>%
  ggplot(aes(x=center, color=motif, linetype=origin)) +
  stat_density(aes(y=..scaled..), geom='line', position='identity', 
               size=1.0, alpha=0.9) +
  geom_hline(yintercept=0) +
  geom_vline(xintercept=0, linetype='dashed', color='black') +
  coord_cartesian(xlim=c(-RADIUS, RADIUS)) +
  scale_x_continuous(breaks=seq(-RADIUS, RADIUS, 100), 
                     limits=c(-RADIUS - 100, RADIUS+100)) +
  scale_color_manual(values=c(TGTA="#73ADD6", AWTAAA="#358C44", TKTKTK="#F36B4D")) +
  scale_linetype_manual(values=c("GENCODE"="solid", "UTRome"="dashed")) +
  labs(x="Distance from RefSeq Cleavage Site",
       y="Relative Density", color="Motif", linetype="Annotation") +
  guides(color=guide_legend(override.aes=list(alpha=1, size=3))) +
  theme_minimal_vgrid()

GENCODE vs Upstream vs Downstream

grs_cleavage_ud <- gr_cleavage %>% split(.$origin_ud)

seqs_cleavage_ud <- lapply(grs_cleavage_ud, . %>% getSeq(x=BSgenome.Hsapiens.UCSC.hg38))

lapply_get_centered_motif_sites <- function(motif, seqs=seqs_cleavage_ud) {
  lapply(seqs, . %>% get_centered_motif_sites(motif=motif)) %>% 
    do.call(what=rbind) %>%
    mutate(origin=str_extract(rownames(.), "^[^.]+")) %>%
    `rownames<-`(NULL)
}

df_tgta_ud <- lapply_get_centered_motif_sites(motif="TGTA") %>%
  mutate(origin=factor(origin, levels=c("GENCODE", "upstream", "downstream")))
df_pas_ud <- lapply_get_centered_motif_sites(motif="AWTAAA") %>%
  mutate(origin=factor(origin, levels=c("GENCODE", "upstream", "downstream")))
df_tktktk_ud <- lapply_get_centered_motif_sites(motif="TKTKTK") %>%
  mutate(origin=factor(origin, levels=c("GENCODE", "upstream", "downstream")))

df_combined_ud <- rbind(df_tgta_ud, df_pas_ud, df_tktktk_ud)

Plot All

df_combined_ud %>%
  ggplot(aes(x=center, color=motif, linetype=origin)) +
  stat_density(aes(y=..scaled..), geom='line', position='identity', 
               size=1.0, alpha=0.9) +
  geom_hline(yintercept=0) +
  geom_vline(xintercept=0, linetype='dashed', color='black') +
  coord_cartesian(xlim=c(-RADIUS, RADIUS)) +
  scale_x_continuous(breaks=seq(-RADIUS, RADIUS, 100), 
                     limits=c(-RADIUS - 100, RADIUS+100)) +
  scale_color_manual(values=c(TGTA="#73ADD6", AWTAAA="#358C44", TKTKTK="#F36B4D")) +
  scale_linetype_manual(values=c("GENCODE"="solid", "upstream"="dotted", "downstream"="dashed")) +
  labs(x="Distance from RefSeq Cleavage Site",
       y="Relative Density", color="Motif", linetype="Annotation") +
  guides(color=guide_legend(override.aes=list(alpha=1, size=3))) +
  theme_minimal_vgrid()

Plot Individual

TGTA

df_tgta_ud %>%
  ggplot(aes(x=center, color=motif, linetype=origin)) +
  stat_density(aes(y=..scaled..), geom='line', position='identity', 
               size=1.0, alpha=0.9) +
  geom_hline(yintercept=0) +
  geom_vline(xintercept=0, linetype='dashed', color='black') +
  coord_cartesian(xlim=c(-RADIUS, RADIUS)) +
  scale_x_continuous(breaks=seq(-RADIUS, RADIUS, 100), 
                     limits=c(-RADIUS - 100, RADIUS+100)) +
  scale_color_manual(values=c(TGTA="#73ADD6", AWTAAA="#358C44", TKTKTK="#F36B4D")) +
  scale_linetype_manual(values=c("GENCODE"="solid", "upstream"="dotted", "downstream"="dashed")) +
  labs(x="Distance from RefSeq Cleavage Site",
       y="Relative Density", color="Motif", linetype="Annotation") +
  guides(color=FALSE) +
  theme_minimal_vgrid()

AWTAAA

df_pas_ud %>%
  ggplot(aes(x=center, color=motif, linetype=origin)) +
  stat_density(aes(y=..scaled..), geom='line', position='identity', 
               size=1.0, alpha=0.9) +
  geom_hline(yintercept=0) +
  geom_vline(xintercept=0, linetype='dashed', color='black') +
  coord_cartesian(xlim=c(-RADIUS, RADIUS)) +
  scale_x_continuous(breaks=seq(-RADIUS, RADIUS, 100), 
                     limits=c(-RADIUS - 100, RADIUS+100)) +
  scale_color_manual(values=c(TGTA="#73ADD6", AWTAAA="#358C44", TKTKTK="#F36B4D")) +
  scale_linetype_manual(values=c("GENCODE"="solid", "upstream"="dotted", "downstream"="dashed")) +
  labs(x="Distance from RefSeq Cleavage Site",
       y="Relative Density", color="Motif", linetype="Annotation") +
  guides(color=FALSE) +
  theme_minimal_vgrid()

TKTKTK

df_tktktk_ud %>%
  ggplot(aes(x=center, color=motif, linetype=origin)) +
  stat_density(aes(y=..scaled..), geom='line', position='identity', 
               size=1.0, alpha=0.9) +
  geom_hline(yintercept=0) +
  geom_vline(xintercept=0, linetype='dashed', color='black') +
  coord_cartesian(xlim=c(-RADIUS, RADIUS)) +
  scale_x_continuous(breaks=seq(-RADIUS, RADIUS, 100), 
                     limits=c(-RADIUS - 100, RADIUS+100)) +
  scale_color_manual(values=c(TGTA="#73ADD6", AWTAAA="#358C44", TKTKTK="#F36B4D")) +
  scale_linetype_manual(values=c("GENCODE"="solid", "upstream"="dotted", "downstream"="dashed")) +
  labs(x="Distance from RefSeq Cleavage Site",
       y="Relative Density", color="Motif", linetype="Annotation") +
  guides(color=FALSE) +
  theme_minimal_vgrid()

STOP-Codon Readthrough-Prevention Motif

All Sites (79070 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_cleavage)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)
df_caatta <- get_centered_motif_sites(motif="CAATTA", seqs)

rbind(df_tgta, df_pas, df_tktktk, df_caatta) %>%
  plot_motif_density(title="All", col_values=COLOR_VALS_EXTRA)

Single-UTR Genes (3747 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_single)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)
df_caatta <- get_centered_motif_sites(motif="CAATTA", seqs)

rbind(df_tgta, df_pas, df_tktktk, df_caatta) %>%
  plot_motif_density(title="Single-UTR Genes", col_values=COLOR_VALS_EXTRA)

GENCODE Only

Proximal Cleavage Sites (10573 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_proximal_gc)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)
df_caatta <- get_centered_motif_sites(motif="CAATTA", seqs)

rbind(df_tgta, df_pas, df_tktktk, df_caatta) %>%
  plot_motif_density(title="GENCODE Proximal Sites", col_values=COLOR_VALS_EXTRA)

Non-Distal Cleavage Sites (39828 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_nondistal_gc)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)
df_caatta <- get_centered_motif_sites(motif="CAATTA", seqs)

rbind(df_tgta, df_pas, df_tktktk, df_caatta) %>%
  plot_motif_density(title="GENCODE Non-Distal Sites", col_values=COLOR_VALS_EXTRA)

Distal Cleavage Sites (12973 sites)

seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, gr_distal_gc)

df_tgta <- get_centered_motif_sites("TGTA", seqs)
df_pas <- get_centered_motif_sites(motif="AWTAAA", seqs)
df_tktktk <- get_centered_motif_sites(motif="TKTKTK", seqs)
df_caatta <- get_centered_motif_sites(motif="CAATTA", seqs)

rbind(df_tgta, df_pas, df_tktktk, df_caatta) %>%
  plot_motif_density(title="GENCODE Distal Sites", col_values=COLOR_VALS_EXTRA)


Session Info

## R version 4.2.2 (2022-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS/LAPACK: /Users/mfansler/miniconda3/envs/bioc_3_16/lib/libopenblasp-r0.3.21.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] cowplot_1.1.1                     magrittr_2.0.3                   
##  [3] lubridate_1.9.2                   forcats_1.0.0                    
##  [5] stringr_1.5.0                     dplyr_1.1.0                      
##  [7] purrr_1.0.1                       readr_2.1.4                      
##  [9] tidyr_1.3.0                       tibble_3.1.8                     
## [11] ggplot2_3.4.1                     tidyverse_2.0.0                  
## [13] plyranges_1.18.0                  BSgenome.Hsapiens.UCSC.hg38_1.4.5
## [15] BSgenome_1.66.3                   rtracklayer_1.58.0               
## [17] Biostrings_2.66.0                 XVector_0.38.0                   
## [19] GenomicRanges_1.50.0              GenomeInfoDb_1.34.9              
## [21] IRanges_2.32.0                    S4Vectors_0.36.0                 
## [23] BiocGenerics_0.44.0              
## 
## loaded via a namespace (and not attached):
##  [1] MatrixGenerics_1.10.0       Biobase_2.58.0             
##  [3] sass_0.4.2                  jsonlite_1.8.4             
##  [5] bslib_0.4.1                 highr_0.9                  
##  [7] GenomeInfoDbData_1.2.9      Rsamtools_2.14.0           
##  [9] yaml_2.3.6                  pillar_1.8.1               
## [11] lattice_0.20-45             glue_1.6.2                 
## [13] digest_0.6.30               colorspace_2.0-3           
## [15] htmltools_0.5.4             Matrix_1.5-3               
## [17] XML_3.99-0.12               pkgconfig_2.0.3            
## [19] zlibbioc_1.44.0             scales_1.2.1               
## [21] tzdb_0.3.0                  BiocParallel_1.32.0        
## [23] timechange_0.2.0            farver_2.1.1               
## [25] generics_0.1.3              ellipsis_0.3.2             
## [27] cachem_1.0.6                withr_2.5.0                
## [29] SummarizedExperiment_1.28.0 cli_3.6.0                  
## [31] crayon_1.5.2                evaluate_0.18              
## [33] fansi_1.0.3                 textshaping_0.3.6          
## [35] tools_4.2.2                 hms_1.1.2                  
## [37] BiocIO_1.8.0                lifecycle_1.0.3            
## [39] matrixStats_0.62.0          munsell_0.5.0              
## [41] DelayedArray_0.24.0         compiler_4.2.2             
## [43] jquerylib_0.1.4             systemfonts_1.0.4          
## [45] rlang_1.1.0                 grid_4.2.2                 
## [47] RCurl_1.98-1.9              rstudioapi_0.14            
## [49] rjson_0.2.21                labeling_0.4.2             
## [51] bitops_1.0-7                rmarkdown_2.18             
## [53] restfulr_0.0.15             gtable_0.3.1               
## [55] codetools_0.2-18            R6_2.5.1                   
## [57] GenomicAlignments_1.34.0    knitr_1.40                 
## [59] fastmap_1.1.0               utf8_1.2.2                 
## [61] ragg_1.2.5                  stringi_1.7.8              
## [63] parallel_4.2.2              vctrs_0.6.1                
## [65] tidyselect_1.2.0            xfun_0.34